library(tidyverse)
library(corrplot)
library("PerformanceAnalytics")
library(gtsummary)
library(flextable)
library(caret)
library(glmnet)
library(e1071)
food_data <- read_csv("Food_Supply_kcal_Data.csv")
head(food_data)
## # A tibble: 6 x 32
##   Country    `Alcoholic Bever… `Animal Product… `Animal fats` `Aquatic Products…
##   <chr>                  <dbl>            <dbl>         <dbl>              <dbl>
## 1 Afghanist…            0                  4.78         0.850                  0
## 2 Albania               0.912             16.1          1.06                   0
## 3 Algeria               0.0896             6.03         0.194                  0
## 4 Angola                1.94               4.69         0.264                  0
## 5 Antigua a…            2.30              15.4          1.54                   0
## 6 Argentina             1.44              15.0          1.06                   0
## # … with 27 more variables: Cereals - Excluding Beer <dbl>, Eggs <dbl>,
## #   Fish, Seafood <dbl>, Fruits - Excluding Wine <dbl>, Meat <dbl>,
## #   Milk - Excluding Butter <dbl>, Miscellaneous <dbl>, Offals <dbl>,
## #   Oilcrops <dbl>, Pulses <dbl>, Spices <dbl>, Starchy Roots <dbl>,
## #   Stimulants <dbl>, Sugar Crops <dbl>, Sugar & Sweeteners <dbl>,
## #   Treenuts <dbl>, Vegetal Products <dbl>, Vegetable Oils <dbl>,
## #   Vegetables <dbl>, Obesity <dbl>, Undernourished <chr>, Confirmed <dbl>,
## #   Deaths <dbl>, Recovered <dbl>, Active <dbl>, Population <dbl>,
## #   Unit (all except Population) <chr>
sum(food_data[1,2:24])
## [1] 100.0002
food_data <- food_data  %>%
  drop_na() %>%
  mutate(Confirmed_per_capita = Confirmed/Population * 100000000, Deaths_per_capita = Deaths/Population * 100000000) %>%
  #creating a column that specifies whether each country falls above or below the median
  mutate(Cases_vs_median = as.character(ifelse(Confirmed_per_capita > median(Confirmed_per_capita), "Above", "Below")), 
         Deaths_vs_median = as.character(ifelse(Deaths_per_capita > median(Deaths_per_capita), "Above", "Below")))

food_data <- food_data %>%
  select(-Miscellaneous, -Obesity, -Undernourished, -Recovered, -Active, -Population, -`Unit (all except Population)`, 
         -Confirmed, -Deaths)
head(food_data)
## # A tibble: 6 x 27
##   Country   `Alcoholic Bevera… `Animal Product… `Animal fats` `Aquatic Products…
##   <chr>                  <dbl>            <dbl>         <dbl>              <dbl>
## 1 Afghanis…             0                  4.78         0.850                  0
## 2 Albania               0.912             16.1          1.06                   0
## 3 Algeria               0.0896             6.03         0.194                  0
## 4 Angola                1.94               4.69         0.264                  0
## 5 Argentina             1.44              15.0          1.06                   0
## 6 Armenia               0.227             12.8          1.77                   0
## # … with 22 more variables: Cereals - Excluding Beer <dbl>, Eggs <dbl>,
## #   Fish, Seafood <dbl>, Fruits - Excluding Wine <dbl>, Meat <dbl>,
## #   Milk - Excluding Butter <dbl>, Offals <dbl>, Oilcrops <dbl>, Pulses <dbl>,
## #   Spices <dbl>, Starchy Roots <dbl>, Stimulants <dbl>, Sugar Crops <dbl>,
## #   Sugar & Sweeteners <dbl>, Treenuts <dbl>, Vegetal Products <dbl>,
## #   Vegetable Oils <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## #   Deaths_per_capita <dbl>, Cases_vs_median <chr>, Deaths_vs_median <chr>
corrplot(is.corr=TRUE, cor(food_data[,2:24], use="pairwise.complete.obs"), method="number")

chart.Correlation(food_data[,2:24], histogram=TRUE)

# summary statistics
food_data_1 <- food_data %>%
  select(-Deaths_vs_median, -Country)
tbl_summary(data = food_data_1, by = "Cases_vs_median",
            statistic = list(all_continuous() ~ "{mean} ({sd})",
                             all_categorical() ~ "{n}({p}%)")) %>%
  add_n() %>%
  add_p(test = all_continuous() ~ "aov") %>%
  as_flex_table() %>%
  bold(part = "header")
food_data_2 <- food_data %>%
  select(-Cases_vs_median, -Country)
tbl_summary(data = food_data_2, by = "Deaths_vs_median",
            statistic = list(all_continuous() ~ "{mean} ({sd})",
                             all_categorical() ~ "{n}({p}%)")) %>%
  add_n() %>%
  add_p(test = all_continuous() ~ "aov") %>%
  as_flex_table() %>%
  bold(part = "header")
# prepare data for cross validation
food_data <- food_data %>%
  # delete country since only each country has only one observation
  select(-`Animal Products`, -Country,-`Vegetal Products`)
food_data %>% head()
## # A tibble: 6 x 24
##   `Alcoholic Bevera… `Animal fats` `Aquatic Products,… `Cereals - Exclud…   Eggs
##                <dbl>         <dbl>               <dbl>              <dbl>  <dbl>
## 1             0              0.850                   0               37.1 0.150 
## 2             0.912          1.06                    0               16.2 0.809 
## 3             0.0896         0.194                   0               25.0 0.418 
## 4             1.94           0.264                   0               18.4 0.0441
## 5             1.44           1.06                    0               16.8 0.864 
## 6             0.227          1.77                    0               19.3 0.731 
## # … with 19 more variables: Fish, Seafood <dbl>, Fruits - Excluding Wine <dbl>,
## #   Meat <dbl>, Milk - Excluding Butter <dbl>, Offals <dbl>, Oilcrops <dbl>,
## #   Pulses <dbl>, Spices <dbl>, Starchy Roots <dbl>, Stimulants <dbl>,
## #   Sugar Crops <dbl>, Sugar & Sweeteners <dbl>, Treenuts <dbl>,
## #   Vegetable Oils <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## #   Deaths_per_capita <dbl>, Cases_vs_median <chr>, Deaths_vs_median <chr>
library(GGally)
# a panel of scatterplots using `GGally`
ggpairs(data = food_data[,c(1:20,23)],
        ggplot2::aes(colour = Cases_vs_median, alpha =0.5))

# numeric 
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Confirmed_per_capita, k=10)
result_lm <- list()
result_knn <- list()
result_lasso <- list()
result_ridge <- list()
      for(f in 1:length(cv_folds_10)){
        food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
        food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
        lm_fit = lm(Confirmed_per_capita~.-Deaths_per_capita-Cases_vs_median-Deaths_vs_median, data = food_data_train_cv_10)
        knn_fit = train(Confirmed_per_capita~.-Deaths_per_capita-Cases_vs_median-Deaths_vs_median, data = food_data_train_cv_10, 
                         method = "knn", tuneLength = 20, preProcess = c("scale", "center"),
                        trControl = trainControl(method="cv", number = 5))
        lasso_fit = cv.glmnet(x = data.matrix(food_data_train_cv_10[,c(1:20)]), y = data.matrix(food_data_train_cv_10[,21]), alpha=1)
        ridge_fit = cv.glmnet(x = data.matrix(food_data_train_cv_10[,c(1:20)]), y = data.matrix(food_data_train_cv_10[,21]), alpha=0)
        
        #linear
        food_data_test_cv_10$Confirmed_lm_predict <- predict(lm_fit, newdata = food_data_test_cv_10)
        result_lm[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_lm_predict)^2, na.rm=TRUE)
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_lm_predict)
        #knn
        food_data_test_cv_10$Confirmed_knn_predict <- predict(knn_fit, newdata = food_data_test_cv_10)
        result_knn[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_knn_predict)^2, na.rm=TRUE)
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_knn_predict)
        #lasso
        food_data_test_cv_10 = as.data.frame(food_data_test_cv_10)
        food_data_test_cv_10$Confirmed_lasso_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:20)])) %*% coef(lasso_fit, s = "lambda.min")
        result_lasso[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_lasso_predict)^2, na.rm=TRUE)
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_lasso_predict)        
        #ridge
        food_data_test_cv_10$Confirmed_ridge_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:20)])) %*% coef(ridge_fit, s = "lambda.min")
        result_ridge[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_ridge_predict)^2, na.rm=TRUE)
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_ridge_predict)
      }

data.frame_lm = data.frame("CV_error" = mean(unlist(result_lm)), "CV_error_SE" = sd(unlist(result_lm)))
data.frame_knn = data.frame("CV_error" = mean(unlist(result_knn)), "CV_error_SE" = sd(unlist(result_knn)))
data.frame_lasso = data.frame("CV_error" = mean(unlist(result_lasso)), "CV_error_SE" = sd(unlist(result_lasso)))
data.frame_ridge = data.frame("CV_error" = mean(unlist(result_ridge)), "CV_error_SE" = sd(unlist(result_ridge)))
rbind("Linear Regression" = data.frame_lm, "KNN" = data.frame_knn, "Lasso" = data.frame_lasso, "Ridge" = data.frame_ridge)
##                   CV_error CV_error_SE
## Linear Regression 64728.59    91521.07
## KNN               37346.76    44964.51
## Lasso             30592.48    32348.18
## Ridge             41715.88    45982.01
food_data <- food_data %>%
  mutate(Cases_vs_median = as.factor(ifelse(Cases_vs_median == "Above",1,0)),
         Deaths_vs_median = as.factor(ifelse(Deaths_vs_median == "Above",1,0)))
# categorical -> predict cases
library(rBayesianOptimization)
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Cases_vs_median, k=10)
result_logistic <- list()
result_knn <- list()
result_linear_svm <- list()
result_radial_svm <- list()
result_polynomial_svm <- list()
result_qda <- list()
result_lasso <- list()
result_ridge <- list()
food_data %>% head()
## # A tibble: 6 x 24
##   `Alcoholic Bevera… `Animal fats` `Aquatic Products,… `Cereals - Exclud…   Eggs
##                <dbl>         <dbl>               <dbl>              <dbl>  <dbl>
## 1             0              0.850                   0               37.1 0.150 
## 2             0.912          1.06                    0               16.2 0.809 
## 3             0.0896         0.194                   0               25.0 0.418 
## 4             1.94           0.264                   0               18.4 0.0441
## 5             1.44           1.06                    0               16.8 0.864 
## 6             0.227          1.77                    0               19.3 0.731 
## # … with 19 more variables: Fish, Seafood <dbl>, Fruits - Excluding Wine <dbl>,
## #   Meat <dbl>, Milk - Excluding Butter <dbl>, Offals <dbl>, Oilcrops <dbl>,
## #   Pulses <dbl>, Spices <dbl>, Starchy Roots <dbl>, Stimulants <dbl>,
## #   Sugar Crops <dbl>, Sugar & Sweeteners <dbl>, Treenuts <dbl>,
## #   Vegetable Oils <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## #   Deaths_per_capita <dbl>, Cases_vs_median <fct>, Deaths_vs_median <fct>
food_data$Cases_vs_median <- as_factor(food_data$Cases_vs_median)
food_data$Deaths_vs_median <- as_factor(food_data$Deaths_vs_median)

      for(f in 1:length(cv_folds_10)){
        food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
        food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
        # knn
        set.seed(2342)
        knn_fit <- train(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, 
                         method = "knn", tuneLength = 20, preProcess = c("scale", "center"),
                        trControl = trainControl(method="cv", number = 5))
        # logistic
        logistic_fit <- glm(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, family = binomial())
        # SVM - linear
        set.seed(2342)
        svr_linear_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="linear", ranges=list(cost=1:3, epsilon=c(0.1,0.5,1)))
        linear_svm_fit <- svr_linear_tune$best.model
        # SVM - radial
        set.seed(2342)
        svr_radial_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="radial", ranges=list(gamma=c(0.001, 0.05, 0.1), cost=1:3, epsilon=c(0.1,0.5,1)))
        radial_svm_fit <- svr_radial_tune$best.model
        # SVM - polynomial
        set.seed(2342)
        svr_polynomial_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="polynomial", ranges=list(gamma=c(0.001, 0.05, 0.1), cost=1:3, d=seq(2,3)))
        polynomial_svm_fit <- svr_polynomial_tune$best.model
        # QDA
        frmla <-as.formula(paste0(names(food_data_train_cv_10)[23], "~", paste0("`", names(food_data_train_cv_10)[-c(3,21:24)], "`", collapse="+")))
        qda_fit <- train(frmla, data = food_data_train_cv_10, method = "qda")
        # lasso & ridge
        lasso_fit = cv.glmnet(x = data.matrix(food_data_train_cv_10[,c(1:20)]), y = data.matrix(food_data_train_cv_10[,23]), alpha=1, family="binomial")
        ridge_fit = cv.glmnet(x = data.matrix(food_data_train_cv_10[,c(1:20)]), y = data.matrix(food_data_train_cv_10[,23]), alpha=0, family="binomial")
        
        # knn
        food_data_test_cv_10$Confirmed_knn_predict <- predict(knn_fit, newdata = food_data_test_cv_10)
        result_knn[[f]] <- 1-confusionMatrix(data = food_data_test_cv_10$Confirmed_knn_predict, 
                                             reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_knn_predict)
        #logistic
        food_data_test_cv_10$Confirmed_logistic_predict_num <- predict(logistic_fit, newdata = food_data_test_cv_10, type="response", na.action=na.exclude)
        food_data_test_cv_10$Confirmed_logistic_predict <- as_factor(ifelse(food_data_test_cv_10$Confirmed_logistic_predict_num >= 0.5, "1", "0"))
        result_logistic[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_logistic_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_logistic_predict, -Confirmed_logistic_predict_num)
        # linear svm
        food_data_test_cv_10$Confirmed_linear_svm_predict <- predict(linear_svm_fit, newdata = food_data_test_cv_10)
        result_linear_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_linear_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_linear_svm_predict)
        # radial svm
        food_data_test_cv_10$Confirmed_radial_svm_predict <- predict(radial_svm_fit, newdata = food_data_test_cv_10)
        result_radial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_radial_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_radial_svm_predict)
        # polynomial svm
        food_data_test_cv_10$Confirmed_polynomial_svm_predict <- predict(polynomial_svm_fit, newdata = food_data_test_cv_10)
        result_polynomial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_polynomial_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_polynomial_svm_predict)
        # qda
        food_data_test_cv_10$Confirmed_qda_predict_num <- predict(qda_fit, newdata = food_data_test_cv_10, type = "prob", na.action=na.exclude)$`1`
        food_data_test_cv_10$Confirmed_qda_predict = relevel(factor(ifelse(food_data_test_cv_10$Confirmed_qda_predict_num > 0.5, "1", "0")), ref = "0")
        result_qda[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_qda_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_qda_predict)
        # Lasso
              # probability of being in class 1
        food_data_test_cv_10$Confirmed_lasso_predict_num <- predict(lasso_fit, newx=data.matrix(food_data_test_cv_10[,c(1:20)]), type="response")
        food_data_test_cv_10$Confirmed_lasso_predict <- as_factor(ifelse(food_data_test_cv_10$Confirmed_lasso_predict_num >= 0.5, "1", "0"))
        result_lasso[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_lasso_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_lasso_predict, -Confirmed_lasso_predict_num)
        # Ridge
        food_data_test_cv_10$Confirmed_ridge_predict_num <- predict(ridge_fit, newx=data.matrix(food_data_test_cv_10[,c(1:20)]), type="response")
        food_data_test_cv_10$Confirmed_ridge_predict <- as_factor(ifelse(food_data_test_cv_10$Confirmed_ridge_predict_num >= 0.5, "1", "0"))
        result_ridge[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_ridge_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_ridge_predict, -Confirmed_ridge_predict_num)
      }

data.frame_knn = data.frame("CV_error" = mean(unlist(result_knn)), "CV_error_SE" = sd(unlist(result_knn)))
data.frame_logistic = data.frame("CV_error" = mean(unlist(result_logistic)), "CV_error_SE" = sd(unlist(result_logistic)))
data.frame_linear_svm = data.frame("CV_error" = mean(unlist(result_linear_svm)), "CV_error_SE" = sd(unlist(result_linear_svm)))
data.frame_radial_svm = data.frame("CV_error" = mean(unlist(result_radial_svm)), "CV_error_SE" = sd(unlist(result_radial_svm)))
data.frame_polynomial_svm = data.frame("CV_error" = mean(unlist(result_polynomial_svm)), "CV_error_SE" = sd(unlist(result_polynomial_svm)))
data.frame_qda = data.frame("CV_error" = mean(unlist(result_qda)), "CV_error_SE" = sd(unlist(result_qda)))
data.frame_lasso = data.frame("CV_error" = mean(unlist(result_lasso)), "CV_error_SE" = sd(unlist(result_lasso)))
data.frame_ridge = data.frame("CV_error" = mean(unlist(result_ridge)), "CV_error_SE" = sd(unlist(result_ridge)))

summary_table <- 
  rbind("KNN" = data.frame_knn, "Logistic Regression" = data.frame_logistic, "Linear SVM" = data.frame_linear_svm, 
        "Radial SVM" = data.frame_radial_svm, "Polynomial SVM" = data.frame_polynomial_svm, "QDA" = data.frame_qda, 
        "Lasso" = data.frame_lasso, "Ridge" = data.frame_ridge)
flextable(summary_table %>% rownames_to_column("Algorithm Name"))
###### lasso best model
lasso_best <- cv.glmnet(x = data.matrix(food_data[,c(1:20)]), y = data.matrix(food_data[,23]), alpha=1)
coef(lasso_best, s = "lambda.min")
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                                    1
## (Intercept)               1.01933817
## Alcoholic Beverages       0.04015531
## Animal fats               .         
## Aquatic Products, Other   .         
## Cereals - Excluding Beer -0.00113986
## Eggs                      .         
## Fish, Seafood             .         
## Fruits - Excluding Wine   0.01518515
## Meat                      .         
## Milk - Excluding Butter   0.04491741
## Offals                    .         
## Oilcrops                  .         
## Pulses                    .         
## Spices                    .         
## Starchy Roots             .         
## Stimulants                0.30721984
## Sugar Crops              -0.09814204
## Sugar & Sweeteners        0.04135097
## Treenuts                  .         
## Vegetable Oils            .         
## Vegetables                .
library(randomForest)
# random forest
names(food_data) <- make.names(names(food_data))
food_data %>% head()
## # A tibble: 6 x 24
##   Alcoholic.Beverag… Animal.fats Aquatic.Products..… Cereals...Excluding…   Eggs
##                <dbl>       <dbl>               <dbl>                <dbl>  <dbl>
## 1             0            0.850                   0                 37.1 0.150 
## 2             0.912        1.06                    0                 16.2 0.809 
## 3             0.0896       0.194                   0                 25.0 0.418 
## 4             1.94         0.264                   0                 18.4 0.0441
## 5             1.44         1.06                    0                 16.8 0.864 
## 6             0.227        1.77                    0                 19.3 0.731 
## # … with 19 more variables: Fish..Seafood <dbl>, Fruits...Excluding.Wine <dbl>,
## #   Meat <dbl>, Milk...Excluding.Butter <dbl>, Offals <dbl>, Oilcrops <dbl>,
## #   Pulses <dbl>, Spices <dbl>, Starchy.Roots <dbl>, Stimulants <dbl>,
## #   Sugar.Crops <dbl>, Sugar...Sweeteners <dbl>, Treenuts <dbl>,
## #   Vegetable.Oils <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## #   Deaths_per_capita <dbl>, Cases_vs_median <fct>, Deaths_vs_median <fct>
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Cases_vs_median, k=10)
result <- list()
result_rf <- list()
n_list <- c(50, 250, 500)
p <- ncol(food_data) - 4
c(p/2, sqrt(p), p)
## [1] 10.000000  4.472136 20.000000
m_list <- c(10, 4, 20)
reg_rf_tune <- list() 
reg_rf_oob_errors <- list()
round = 0

      for(f in 1:length(cv_folds_10)){
        food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
        food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
        for (i in 1:length(n_list)) {
            for (j in 1:length(m_list)) {
                round <- round + 1
                set.seed(7812)
                reg_rf_tune[[round]] <- randomForest(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, 
                                                     data = food_data_train_cv_10, ntree=n_list[i], mtry=m_list[j])
                head(food_data_train_cv_10)
                reg_rf_oob_errors[[round]] <- data.frame("trees_no"=n_list[i], "preds_no"=m_list[j], 
                                     "oob_errors"=reg_rf_tune[[round]]$err.rate[n_list[i],1])
                }
            }
        reg_rf_oob_errors_df <- do.call("rbind", reg_rf_oob_errors)
        best_errors <- which(reg_rf_oob_errors_df$oob_errors==min(reg_rf_oob_errors_df$oob_errors))
        set.seed(9984)
        reg_rf <- randomForest(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10,
                               ntree=reg_rf_oob_errors_df$trees_no[best_errors[1]], mtry=reg_rf_oob_errors_df$preds_no[best_errors[1]])
        Confirmed_Predict_Prob <- predict(reg_rf, newdata = food_data_test_cv_10, type = "prob")
        food_data_test_cv_10$Confirmed_rf_Predict <- colnames(Confirmed_Predict_Prob)[max.col(Confirmed_Predict_Prob, ties.method="first")]
        per_class_accuracy <- rep(NA,length(levels(food_data_test_cv_10$Cases_vs_median)))
        for (i in 1:length(per_class_accuracy)){
          per_class_accuracy[i] = food_data_test_cv_10 %>% 
          filter(Cases_vs_median == levels(Cases_vs_median)[i]) %>%
          summarise(accuracy = sum(Confirmed_rf_Predict == 
                                     levels(Cases_vs_median)[i]) / n()) %>%
          unlist()
          names(per_class_accuracy)[i] <- paste0("accuracy_", 
                                         levels(food_data_test_cv_10$Cases_vs_median)[i])
        }
        result[[f]] = per_class_accuracy
        result_rf[[f]] <- 1- confusionMatrix(data = as.factor(food_data_test_cv_10$Confirmed_rf_Predict),
                                             reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
      }
df = as.data.frame(matrix(unlist(result), ncol = 2, byrow = TRUE))
CV_error = c("Below" = (1-mean(df[,1], na.rm=TRUE)), 
             "Above" = (1-mean(df[,2], na.rm=TRUE)))

CV_error_SE = c("Below" = sd((1-df[,1]), na.rm=TRUE), 
                "Above" = sd((1-df[,2]), na.rm=TRUE))
data.frame(CV_error, CV_error_SE)
##        CV_error CV_error_SE
## Below 0.2142857   0.2032544
## Above 0.1785714   0.1491662
rf <- data.frame("CV Error" = CV_error, "CV Error SE" = CV_error_SE)
flextable(rf %>% rownames_to_column("Class Name"))
data.frame_rf <- data.frame("CV_error" = mean(unlist(result_rf)), "CV_error_SE" = sd(unlist(result_rf)))
summary_table <- rbind(summary_table, "Random Forest" = data.frame_rf)
summary_table
##                      CV_error CV_error_SE
## KNN                 0.2350595  0.08279182
## Logistic Regression 0.2051786  0.11602243
## Linear SVM          0.2386310  0.14153751
## Radial SVM          0.2484524  0.10199887
## Polynomial SVM      0.2502381  0.13471454
## QDA                 0.2573214  0.12212815
## Lasso               0.2016071  0.10989594
## Ridge               0.2279762  0.11053066
## Random Forest       0.1976190  0.09852584
flextable(summary_table %>% rownames_to_column("Algorithm Name"))
# best random forest - top 10 predictors
reg_rf_tune <- list() 
reg_rf_oob_errors <- list()
round = 0
for (i in 1:length(n_list)) {
            for (j in 1:length(m_list)) {
                round <- round + 1
                set.seed(7812)
                reg_rf_tune[[round]] <- randomForest(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, 
                                                     data = food_data, ntree=n_list[i], mtry=m_list[j])
                reg_rf_oob_errors[[round]] <- data.frame("trees_no"=n_list[i], "preds_no"=m_list[j], 
                                     "oob_errors"=reg_rf_tune[[round]]$err.rate[n_list[i],1])
                }
}
reg_rf_oob_errors_df <- do.call("rbind", reg_rf_oob_errors)
best_errors <- which(reg_rf_oob_errors_df$oob_errors==min(reg_rf_oob_errors_df$oob_errors))
set.seed(9984)
reg_rf <- randomForest(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data,
                               ntree=reg_rf_oob_errors_df$trees_no[best_errors[1]], mtry=reg_rf_oob_errors_df$preds_no[best_errors[1]])
varImpPlot(reg_rf, sort = TRUE, n.var = 10, main = "Top 10 Important Variables Predicting Cases vs Median")

varImpPlot(reg_rf, sort = TRUE)